Note:

sampler.py requirements:

- sklearn (kd-tree)
- cartopy (plotting)
- matplotlib 

In [1]:
import os 
import sampler
import numpy as np

%matplotlib inline

Retrieve and parse a text product containing observations:

Retrieve and parse a text product from the Australian Bureau of Meterology.

ftp://ftp2.bom.gov.au/anon/gen/fwo/IDY03021.txt

In [2]:
obs = sampler.dataframe_observations()
print(obs.head())


        station    lat     lon  day  hour  vis  cld wind_dir wind_mag  temp  \
2  Albury AP    -36.07  146.95    1   600  NaN  NaN        W      014     7   
3  Albury AP    -36.07  146.95    1   601    9  NaN        W      014     6   
5  Armidale     -30.52  151.67    1   500   40  NaN        W      027    16   
6  Armidale A   -30.53  151.62    1   600   10  NaN      WNW      024    14   
7  Armidale A   -30.53  151.62    1   600  NaN  NaN      WNW      024    14   

   rh     bar rain_mm rain_hr weather  tx  tn  
2  86  1011.6     NaN     NaN     NaN NaN NaN  
3  86  1011.4       2      9>     NaN NaN NaN  
5  23     NaN     0.0      06    Dust  17 NaN  
6  30  1011.8     0.0      9>    Wndy NaN NaN  
7  30  1011.3     NaN     NaN    Wndy NaN NaN  

In [3]:
sampler.plot_data(obs.lat, obs.lon, obs.temp, overlay=False, S=50)


Out[3]:
<matplotlib.axes.GeoAxesSubplot at 0x10a54bbd0>

Form a fast nearest neighbour sampler (kd-tree)


In [4]:
tree = sampler.sample_tree(obs.lat.values, obs.lon.values)

Form a high-resolution lat/lon grid


In [5]:
lons, lats = np.meshgrid(np.linspace(105.5, 160.4375, 1000), 
                         np.linspace(-49.5, -4.0625, 1000))

Estimate the (whole) temperature grid


In [20]:
def estimate(tree, obs, coords):
    
    distance, obs_values, _ = sampler.nearest_neighbour(
                                        tree, 
                                        obs.temp.values, 
                                        coords[0], 
                                        coords[1], 
                                        K=10)
    
    weights = 1. / distance**2
    
    temp_vector = (np.nansum((weights * obs_values), axis=1) / 
                   np.nansum(weights, axis=1))
    
    return temp_vector, obs_values

In [21]:
%%time 

temp_vector, _values = estimate(tree, obs, (lats, lons))


CPU times: user 4.02 s, sys: 122 ms, total: 4.14 s
Wall time: 4.2 s

In [22]:
ax = sampler.plot_data(lats, lons, np.reshape(temp_vector, lons.shape))

sampler.plot_data(obs.lat, obs.lon, obs.temp, S=50, ax=ax, cbar=False)


Out[22]:
<matplotlib.axes.GeoAxesSubplot at 0x10c51a050>

Chunk the input grid


In [12]:
N = 20

lat_chunks = np.array_split(lats, N)
lon_chunks = np.array_split(lons, N)

In [13]:
%timeit estimate(tree, obs, (lat_chunks[0], lon_chunks[1]))


1 loops, best of 3: 268 ms per loop

In [14]:
from IPython.parallel import Client

c = Client(profile='default')

direct = c.direct_view()

balanced = c.load_balanced_view()


/Users/nfaggian/work/anaconda/envs/python-devel/lib/python3.3/site-packages/IPython/parallel/client/client.py:446: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@138.44.131.187')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

Make a remote function


In [15]:
@balanced.remote()
def parallel_estimate(tree, obs, coords):
    
    import sys; sys.path.append(notebook_path)
    import sampler
    import numpy as np
    
    distance, obs_values, _ = sampler.nearest_neighbour(
                                        tree, 
                                        obs.temp.values, 
                                        coords[0], 
                                        coords[1],
                                        K=10)
    weights = 1. / distance**2
    
    temp_vector = (np.nansum((weights * obs_values), axis=1) / 
                   np.nansum(weights, axis=1))
    
    return temp_vector

Push shared state and process chunks


In [16]:
%%time

direct.push({'notebook_path': os.getcwd()})

parallel_result = [parallel_estimate(tree, obs, x) for x in 
                       zip(lat_chunks, lon_chunks)]

temp_vector = np.c_[[x.result for x in parallel_result]]


CPU times: user 817 ms, sys: 137 ms, total: 954 ms
Wall time: 2.37 s

In [24]:
ax = sampler.plot_data(lats, lons, np.reshape(temp_vector, lons.shape))

sampler.plot_data(obs.lat, obs.lon, obs.temp, S=50, ax=ax, cbar=False)


Out[24]:
<matplotlib.axes.GeoAxesSubplot at 0x10cadb910>

In [17]: